Introduction
In this project, we target the headlines of “The Guardian” newspaper between 18 October 2021 and 28 November 2021 to apply a sentiment analysis around the negotiations of COP26, which took place in Glasgow from October 31st to November 12th.
As a newspaper of the host country of the climate negotiations, The Guardian does not present an ideal sample of headlines for deducing whether COP26 met expectations through sentiment analysis. However, the value of this project is to apply sentiment-analysis procedures after scraping information from the web, and to present the results to the user in an accessible format.


  1. Packages

# Bootstrap pacman (package manager), then use it to install/load everything else.
if (!require("pacman")) install.packages("pacman")

# BUG FIX: "phantomjs" is not an R package, so `require("phantomjs")` always
# failed (see the original "non c'è alcun pacchetto chiamato 'phantomjs'"
# warning) and would have triggered the installer on every knit. PhantomJS is
# an external binary managed by webshot; ask webshot whether it is installed.
if (!webshot::is_phantomjs_installed()) webshot::install_phantomjs()

# Load (installing if missing) all packages used in this report.
pacman::p_load(tidyverse,   # dplyr/ggplot2/tidyr/stringr etc.
               rvest,       # web scraping
               stringr,
               xml2,
               quanteda,    # corpus/tokens/kwic/dictionary lookup
               janitor,
               tidytext,    # unnest_tokens, get_sentiments
               lubridate,
               wordcloud,
               SnowballC,
               dygraphs,    # interactive time-series plots
               xts,
               tsbox,
               tm,
               kableExtra,  # styled HTML tables
               magick,
               textdata,    # AFINN lexicon
               plotly)

  2. Scraping Headlines by Date and Data Cleaning

To examine this step in detail, check the attached scraping.R file at https://github.com/lauramenicacci/cop26_text_analysis/blob/e3697a69534a5e9d540bfc1913f52bd096de99ee/scraping.R

# Run the scraping/cleaning pipeline. It is expected to define the objects
# used throughout this report: `cop26` (Headlines + Dates), `words_list`
# (tokenized words) and `customized_stopwords`.
# NOTE(review): the prose above links to "scraping.R" but this sources
# "scraping_and_data_cleaning.R" — confirm both names refer to the same file.
source("./scraping_and_data_cleaning.R")
## Warning: 20 failed to parse.
## Joining, by = "word"
  3. Explorative Analysis

3.1. Frequency Table

# Frequency table: the ten most common words across all headlines,
# rendered as a styled standalone HTML table.
word_counts <- words_list %>% count(word, sort = TRUE)

freq_table <- word_counts %>%
  head(10) %>%
  kbl() %>%
  kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = FALSE,
    fixed_thead = TRUE
  )

save_kable(freq_table, "FrequencyTable.html")

3.2. Explorative Wordcloud

# Exploratory word cloud of the most frequent words (freq >= 20).
# Anything irrelevant that shows up here gets added to the stop-word list.
par(mar = rep(0.1, 4))  # shrink margins so the cloud fills the device

wordcloud(
  words_list$word,
  min.freq = 20,
  random.order = FALSE,            # most frequent words in the center
  colors = brewer.pal(8, "Dark2")  # color by frequency bucket
)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

Exploring the most frequent words helps to identify irrelevant ones and remove them by adding them to the stop-word dictionary (e.g. “Glasgow”).

Check scraping file https://github.com/lauramenicacci/cop26_text_analysis/blob/e3697a69534a5e9d540bfc1913f52bd096de99ee/scraping.R

3.3. Keyword in Context

# Keyword-in-context: every occurrence of "crisis" with ten tokens of
# context on each side, shown in a searchable interactive table.
headline_tokens <- tokens(corpus(cop26$Headlines))
crisis_contexts <- kwic(headline_tokens, phrase("crisis"), window = 10)

DT::datatable(
  crisis_contexts,
  caption = "Keywords in context",
  rownames = FALSE,
  options = list(scrollX = TRUE, pageLength = 5, lengthMenu = c(5, 10, 15, 20))
)
  4. Sentiment Analysis Using the “bing” Dictionary

Result: we obtain a dataframe with words, dates, sentiments, and the frequency of occurrence of those words. First plot: we start from the general picture and visualize how sentiment changes through time.

4.1. Sentiment Analysis Dataframe

# Classify words with the "bing" lexicon (positive/negative) and count them.
# ~293 words end up classified.
words_sentiments <- words_list %>%
  filter(word != "crisis") %>%   # exclude "crisis" before the sentiment analysis
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment)

# Attach sentiment + count back onto the full word list; unclassified words
# get NA here (mapped to "neutral"/0 in the next section).
sentiment_complete <- left_join(words_list, words_sentiments, by = "word")

# Headlines mentioning Boris Johnson. The original three str_detect() calls
# ("Boris Johnson" | "Boris" | "Johnson") reduce to a single alternation:
# any match of the full name necessarily matches "Boris".
toks_cop26 <- cop26 %>%
  filter(str_detect(Headlines, "Boris|Johnson"))

# Ten most frequent (non-stop-word) words in the Boris/Johnson headlines.
words_list_boris <- toks_cop26 %>%
  unnest_tokens(word, Headlines, token = "regex", pattern = "\\s+|[[:punct:]]+") %>%
  anti_join(customized_stopwords, by = "word") %>%
  arrange(Dates) %>%
  count(word, sort = TRUE) %>%
  head(10)

4.2. Basic Visualization

# Words with no bing classification count as "neutral" with frequency 0.
# (Direct mutate() instead of single-column across(), which is overkill here.)
sentiment_complete <- sentiment_complete %>%
  mutate(
    sentiment = replace_na(sentiment, "neutral"),
    n = replace_na(n, 0)
  )

# Flip negative counts below zero so the bar chart diverges around the axis.
sentiment_complete <- sentiment_complete %>%
  mutate(n = if_else(sentiment == "negative", -n, n))

sentiment_complete %>%
  filter(sentiment != "neutral") %>%
  ggplot(aes(x = Dates, y = n, fill = sentiment)) +
  geom_bar(stat = "identity", width = 0.5) +
  labs(title = "Sentiment frequency plot by date",
       y = "Count", x = "Date", fill = "sentiment") +
  # dashed guides mark the official start and end of COP26
  geom_vline(xintercept = as.Date("2021-10-31"), linetype = "dashed", colour = "darkgrey") +
  geom_vline(xintercept = as.Date("2021-11-12"), linetype = "dashed", colour = "darkgrey") +
  theme_minimal() +
  theme(panel.border = element_rect(color = "black",
                                    fill = NA,
                                    size = 0.3),
        plot.background = element_rect(colour = "white"))
## Warning: Removed 12 rows containing missing values (position_stack).

ggsave("SentimentFrequencyPlot.png", width = 5, height = 3)
  5. Exploration of Results

We try to understand which words are the most used overall. First we plot them without dates to see the overall result; then we add the dates in a second plot.

5.1. Plot of Results

# Most common sentiment-bearing words (frequency > 4), one facet per
# sentiment, bars ordered by frequency within each facet.
frequent_words <- sentiment_complete %>%
  filter(sentiment != "neutral") %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  filter(n > 4) %>%
  mutate(word = reorder(word, n))

frequent_words %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free_y") +
  labs(title = "Most common words affecting Sentiments",
       x = "Frequency",
       y = NULL) +
  theme_minimal() +
  theme(panel.border = element_rect(color = "black",
                                    fill = NA,
                                    size = 0.3),
        plot.background = element_rect(colour = "white"))

ggsave("MostFrequentlyOccuringWord.png", width = 5, height = 3)

5.2. Summary Interactive Plots

5.2.1. Manipulation of the Dataset into a ‘timeseries’ Object

# Count word occurrences per date and sentiment, then reshape to one row per
# date with one column per sentiment. A plain pivot_wider() replaces the
# original build_wider_spec()/pivot_wider_spec() pair — the spec machinery is
# only needed for programmatic reshapes, not a fixed one like this.
counter <- sentiment_complete %>%
  count(Dates, sentiment) %>%
  # defensive no-op: NAs were already mapped to "neutral" upstream
  mutate(sentiment = replace_na(sentiment, "neutral")) %>%
  pivot_wider(names_from = sentiment, values_from = n) %>%
  na.omit()  # drop dates with missing counts (incl. the unparsed dates)

# One xts time series per sentiment, indexed by date.
negatives <- xts(counter$negative, counter$Dates)
positives <- xts(counter$positive, counter$Dates)
neutrals  <- xts(counter$neutral, counter$Dates)

# Bind the three series into a single multivariate time series.
timeseries <- cbind(negatives, positives, neutrals)

5.2.2. All Sentiment Interactive Plot by Date

5.2.3. Positive vs Negative Sentiment Interactive Plots by Date

# Interactive comparison of positive vs negative sentiment counts over time.
timeseries2 <- cbind(negatives, positives)

pos_neg_plot <- dygraph(timeseries2,
                        main = "Positive and negative sentiments across time",
                        xlab = "Date",
                        ylab = "Frequency")

pos_neg_plot <- pos_neg_plot %>%
  dySeries("negatives", label = "Negative") %>%
  dySeries("positives", label = "Positive") %>%
  dyOptions(labelsUTC = TRUE, fillGraph = TRUE, fillAlpha = 0.5, drawGrid = FALSE) %>%
  dyRangeSelector() %>%                      # draggable date-range selector
  dyCrosshair(direction = "vertical") %>%
  dyHighlight(highlightCircleSize = 3,
              highlightSeriesBackgroundAlpha = 0.3,
              hideOnMouseOut = FALSE) %>%
  dyRoller(rollPeriod = 1)                   # rolling-average control

pos_neg_plot
  6. Targeted Sentiment Analysis

6.1 Sentiment Density for COP26

# Tokenize all headlines and the Boris/Johnson subset.
toks_c26 <- cop26$Headlines %>%
  corpus() %>%
  tokens()
boris_c26 <- toks_cop26$Headlines %>%
  corpus() %>%
  tokens()

# Score tokens against the negative/positive entries of the Lexicoder
# Sentiment Dictionary and tag each row with its group label.
# (Shared helper replaces the two copy-pasted pipelines of the original.)
lsd_scores <- function(toks, type) {
  tokens_lookup(toks, dictionary = data_dictionary_LSD2015[1:2]) %>%
    dfm() %>%
    convert(to = "data.frame") %>%
    mutate(Type = type)
}

sent_c26 <- lsd_scores(toks_c26, "Cop26")
sent_boris <- lsd_scores(boris_c26, "Boris")

sentiment <- rbind(sent_c26, sent_boris)
# Log positive/negative ratio with 0.5 smoothing so zero counts stay finite.
sentiment$score <- log((sentiment$positive + 0.5) / (sentiment$negative + 0.5))

sentiment %>%
  ggplot(aes(x = score, fill = Type)) +
  geom_density(alpha = 0.3) +
  labs(title = "Distribution of Sentiments",
       subtitle = str_wrap('This graph plots the distribution of sentiments across all headlines, as well as the distribution of a targeted sentiment analysis around the name "Boris Johnson".', width = 85),
       x = "Sentiment Score", y = "Density") +
  theme_minimal() +   # the original called theme_minimal() twice; once suffices
  theme(panel.border = element_rect(color = "black",
                                    fill = NA,
                                    size = 0.3),
        plot.background = element_rect(colour = "white"))

ggsave("DensityPlots.png", width = 5, height = 3)
# Kept for reference; currently unused.
#sent_negative <- words_list %>% #we have 293 classified words, DO WE NEED THESE? THE NEGATIVES ONLY
 # filter(!word == "crisis") %>% #remove the word crisis before the sentiment analysis
  #inner_join(get_sentiments("bing")) %>% 
  #filter(sentiment=="negative")

#sent_negative %>% 
 # count(Dates, sentiment) %>% 
  #ggplot(aes(x=Dates, y=n)) +
  #geom_line(color="red") +
  #theme_minimal() +
  #ylab("Frequency of Negative Words during COP26")+
  #xlab("Date")




#I kept the ggplot in case we drop the interactive ones
#sentiment_complete %>% 
 # drop_na() %>% 
  #count(Dates, sentiment) %>% 
  #group_by(sentiment) %>%
  #ggplot(aes(Dates,y=n,color=sentiment)) +
  #geom_line() +
  #facet_wrap(~ sentiment, scales = "free_x") +
  #theme_minimal() +
  #ylab("Frequency of Negative Words during COP26")+
  #xlab("Date")
# AFINN analysis: each word gets an integer sentiment value from -5 to +5.
word_sent_afinn <- words_list %>%
  filter(word != "crisis") %>%  # exclude "crisis", as in the bing analysis
  # first use of AFINN asks for an interactive download confirmation
  inner_join(get_sentiments("afinn"), by = "word") %>%
  count(word, value)

# Join back onto the main word list; unmatched words get NA values.
sentiment_complete_afinn <- left_join(words_list, word_sent_afinn, by = "word")

# Per-date counts of each sentiment value. The original routed NA through a
# character "0" and back via as.numeric(); replacing with a numeric 0 does
# the same job directly (and avoids a type clash under current tidyr).
counter_afinn <- sentiment_complete_afinn %>%
  count(Dates, value) %>%
  mutate(value = replace_na(value, 0)) %>%
  na.omit() %>%            # drop rows with unparsed (NA) dates
  filter(value != 0)       # drop the unclassified words

# Factor over the full -5..5 range so colors map consistently even when
# some values never occur in the data.
counter_afinn$value <- factor(counter_afinn$value,
                              levels = c("-5", "-4", "-3", "-2", "-1",
                                         "1", "2", "3", "4", "5"))

# Diverging red-to-blue palette (picked with https://imagecolorpicker.com/).
colors <- c("#b2182b", "#d6604d", "#f4a582", "#fddbc7",
            "#d1e5f0", "#92c5de", "#4393c3", "#2166ac")

# Bubble chart: date vs sentiment value, bubble size = word count.
scatter_result <- plot_ly(counter_afinn,
        x = ~Dates, y = ~value, color = ~value, size = ~n,
        colors = colors, sizes = c(10, 40),
        type = 'scatter', mode = 'markers',
        marker = list(symbol = 'circle', sizemode = 'diameter',
                      line = list(width = 1, color = 'lightgrey')),
        text = ~paste('Count of Words', n),
        alpha = 1)  # was 1.2; alpha must lie in [0, 1]

scatter_result <- scatter_result %>% layout(title = 'AFINN Dictionary Results',
         xaxis = list(title = 'Dates',
                      gridcolor = 'white',
                      range = c("2021-10-14", "2021-11-20"),
                      zerolinewidth = 1,
                      ticklen = 3,
                      gridwidth = 2),
         yaxis = list(title = 'Sentiment Value',
                      gridcolor = 'lightgray',
                      zerolinewidth = 1,
                      ticklen = 3,
                      gridwidth = 2),  # fixed typo: was "gridwith", silently ignored
         paper_bgcolor = 'rgb(255, 255, 255)',
         plot_bgcolor = 'rgb(255,255,255)')
scatter_result
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.
# AFINN analysis restricted to the Boris/Johnson headline subset.
toks_cop26_boris <- toks_cop26 %>%
  unnest_tokens(word, Headlines)

word_sent_afinn_boris <- toks_cop26_boris %>%
  filter(word != "crisis") %>%  # exclude "crisis" before the sentiment analysis
  inner_join(get_sentiments("afinn"), by = "word") %>%
  count(word, value)

# NOTE(review): this joins against the FULL words_list, not the Boris-only
# token list — toks_cop26_boris looks like the natural base for a plot titled
# "for Boris Headlines". Preserved as-is; confirm whether this is intended.
sentiment_boris_afinn <- left_join(words_list, word_sent_afinn_boris, by = "word")

# Per-date counts; numeric 0 replaces the original "0"-string + as.numeric()
# round trip (same result, no type clash under current tidyr).
counter_boris_afinn <- sentiment_boris_afinn %>%
  count(Dates, value) %>%
  mutate(value = replace_na(value, 0)) %>%
  na.omit() %>%            # drop rows with unparsed (NA) dates
  filter(value != 0)       # drop the unclassified words

# Factor over the full -5..5 range for a stable color mapping.
counter_boris_afinn$value <- factor(counter_boris_afinn$value,
                                    levels = c("-5", "-4", "-3", "-2", "-1",
                                               "1", "2", "3", "4", "5"))

# Same diverging red-to-blue palette as the full-corpus plot.
colors <- c("#b2182b", "#d6604d", "#f4a582", "#fddbc7",
            "#d1e5f0", "#92c5de", "#4393c3", "#2166ac")

# Bubble chart: date vs sentiment value, bubble size = word count.
scatter_result_boris <- plot_ly(counter_boris_afinn,
        x = ~Dates, y = ~value, color = ~value, size = ~n,
        colors = colors, sizes = c(10, 40),
        type = 'scatter', mode = 'markers',
        marker = list(symbol = 'circle', sizemode = 'diameter',
                      line = list(width = 1, color = 'lightgrey')),
        text = ~paste('Count of Words', n),
        alpha = 1)  # was 1.2; alpha must lie in [0, 1]

scatter_result_boris <- scatter_result_boris %>% layout(title = 'AFINN Dictionary Results for Boris Headlines',
         xaxis = list(title = 'Dates',
                      gridcolor = 'white',
                      range = c("2021-10-14", "2021-11-20"),
                      zerolinewidth = 1,
                      ticklen = 3,
                      gridwidth = 2),
         yaxis = list(title = 'Sentiment Value',
                      gridcolor = 'lightgray',
                      zerolinewidth = 1,
                      ticklen = 3,
                      gridwidth = 2),  # fixed typo: was "gridwith", silently ignored
         paper_bgcolor = 'rgb(255, 255, 255)',
         plot_bgcolor = 'rgb(255,255,255)')
scatter_result_boris
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.